import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from pandas_profiling import ProfileReport
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
movies_df=pd.read_csv("movies_metadata.csv")
ratings_df=pd.read_csv("ratings.csv")
C:\Users\jpras\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3146: DtypeWarning: Columns (10) have mixed types.Specify dtype option on import or set low_memory=False.
movies_df.head()
| adult | belongs_to_collection | budget | genres | homepage | id | imdb_id | original_language | original_title | overview | ... | release_date | revenue | runtime | spoken_languages | status | tagline | title | video | vote_average | vote_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | {'id': 10194, 'name': 'Toy Story Collection', ... | 30000000 | [{'id': 16, 'name': 'Animation'}, {'id': 35, '... | http://toystory.disney.com/toy-story | 862 | tt0114709 | en | Toy Story | Led by Woody, Andy's toys live happily in his ... | ... | 1995-10-30 | 373554033.0 | 81.0 | [{'iso_639_1': 'en', 'name': 'English'}] | Released | NaN | Toy Story | False | 7.7 | 5415.0 |
| 1 | False | NaN | 65000000 | [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... | NaN | 8844 | tt0113497 | en | Jumanji | When siblings Judy and Peter discover an encha... | ... | 1995-12-15 | 262797249.0 | 104.0 | [{'iso_639_1': 'en', 'name': 'English'}, {'iso... | Released | Roll the dice and unleash the excitement! | Jumanji | False | 6.9 | 2413.0 |
| 2 | False | {'id': 119050, 'name': 'Grumpy Old Men Collect... | 0 | [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... | NaN | 15602 | tt0113228 | en | Grumpier Old Men | A family wedding reignites the ancient feud be... | ... | 1995-12-22 | 0.0 | 101.0 | [{'iso_639_1': 'en', 'name': 'English'}] | Released | Still Yelling. Still Fighting. Still Ready for... | Grumpier Old Men | False | 6.5 | 92.0 |
| 3 | False | NaN | 16000000 | [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... | NaN | 31357 | tt0114885 | en | Waiting to Exhale | Cheated on, mistreated and stepped on, the wom... | ... | 1995-12-22 | 81452156.0 | 127.0 | [{'iso_639_1': 'en', 'name': 'English'}] | Released | Friends are the people who let you be yourself... | Waiting to Exhale | False | 6.1 | 34.0 |
| 4 | False | {'id': 96871, 'name': 'Father of the Bride Col... | 0 | [{'id': 35, 'name': 'Comedy'}] | NaN | 11862 | tt0113041 | en | Father of the Bride Part II | Just when George Banks has recovered from his ... | ... | 1995-02-10 | 76578911.0 | 106.0 | [{'iso_639_1': 'en', 'name': 'English'}] | Released | Just When His World Is Back To Normal... He's ... | Father of the Bride Part II | False | 5.7 | 173.0 |
5 rows × 24 columns
ratings_df.head()
| userId | movieId | rating | timestamp | |
|---|---|---|---|---|
| 0 | 1 | 110 | 1.0 | 1425941529 |
| 1 | 1 | 147 | 4.5 | 1425942435 |
| 2 | 1 | 858 | 5.0 | 1425941523 |
| 3 | 1 | 1221 | 5.0 | 1425941546 |
| 4 | 1 | 1246 | 5.0 | 1425941556 |
movies_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 45466 entries, 0 to 45465 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 adult 45466 non-null object 1 belongs_to_collection 4494 non-null object 2 budget 45466 non-null object 3 genres 45466 non-null object 4 homepage 7782 non-null object 5 id 45466 non-null object 6 imdb_id 45449 non-null object 7 original_language 45455 non-null object 8 original_title 45466 non-null object 9 overview 44512 non-null object 10 popularity 45461 non-null object 11 poster_path 45080 non-null object 12 production_companies 45463 non-null object 13 production_countries 45463 non-null object 14 release_date 45379 non-null object 15 revenue 45460 non-null float64 16 runtime 45203 non-null float64 17 spoken_languages 45460 non-null object 18 status 45379 non-null object 19 tagline 20412 non-null object 20 title 45460 non-null object 21 video 45460 non-null object 22 vote_average 45460 non-null float64 23 vote_count 45460 non-null float64 dtypes: float64(4), object(20) memory usage: 8.3+ MB
ratings_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 26024289 entries, 0 to 26024288 Data columns (total 4 columns): # Column Dtype --- ------ ----- 0 userId int64 1 movieId int64 2 rating float64 3 timestamp int64 dtypes: float64(1), int64(3) memory usage: 794.2 MB
movies_df.describe()
| revenue | runtime | vote_average | vote_count | |
|---|---|---|---|---|
| count | 4.546000e+04 | 45203.000000 | 45460.000000 | 45460.000000 |
| mean | 1.120935e+07 | 94.128199 | 5.618207 | 109.897338 |
| std | 6.433225e+07 | 38.407810 | 1.924216 | 491.310374 |
| min | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000e+00 | 85.000000 | 5.000000 | 3.000000 |
| 50% | 0.000000e+00 | 95.000000 | 6.000000 | 10.000000 |
| 75% | 0.000000e+00 | 107.000000 | 6.800000 | 34.000000 |
| max | 2.787965e+09 | 1256.000000 | 10.000000 | 14075.000000 |
ratings_df.rating.describe()
count 2.602429e+07 mean 3.528090e+00 std 1.065443e+00 min 5.000000e-01 25% 3.000000e+00 50% 3.500000e+00 75% 4.000000e+00 max 5.000000e+00 Name: rating, dtype: float64
ratings_df.isna().sum()
userId 0 movieId 0 rating 0 timestamp 0 dtype: int64
movies_df.isna().sum()
adult 0 belongs_to_collection 40972 budget 0 genres 0 homepage 37684 id 0 imdb_id 17 original_language 11 original_title 0 overview 954 popularity 5 poster_path 386 production_companies 3 production_countries 3 release_date 87 revenue 6 runtime 263 spoken_languages 6 status 87 tagline 25054 title 6 video 6 vote_average 6 vote_count 6 dtype: int64
ratings_df.isna().sum()
userId 0 movieId 0 rating 0 timestamp 0 dtype: int64
#movies_df.drop(['belongs_to_collection', 'homepage', 'tagline', 'poster_path', 'overview', 'spoken_languages'], inplace=True, axis=1)
movies_df.isna().sum()
adult 0 belongs_to_collection 40972 budget 0 genres 0 homepage 37684 id 0 imdb_id 17 original_language 11 original_title 0 overview 954 popularity 5 poster_path 386 production_companies 3 production_countries 3 release_date 87 revenue 6 runtime 263 spoken_languages 6 status 87 tagline 25054 title 6 video 6 vote_average 6 vote_count 6 dtype: int64
movies_df.columns
Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
'imdb_id', 'original_language', 'original_title', 'overview',
'popularity', 'poster_path', 'production_companies',
'production_countries', 'release_date', 'revenue', 'runtime',
'spoken_languages', 'status', 'tagline', 'title', 'video',
'vote_average', 'vote_count'],
dtype='object')
movies_df['revenue'] = movies_df['revenue'].replace(0,np.nan)
movies_df['budget'] = pd.to_numeric(movies_df['budget'], errors='coerce')
movies_df['budget'] = movies_df['budget'].replace(0, np.nan)
movies_df[movies_df['budget'].isnull()].shape
(36576, 24)
movies_df['return'] = movies_df['revenue']/movies_df['budget']
movies_df['release_date'] = pd.to_datetime(movies_df['release_date'], errors='coerce')
movies_df['year'] = pd.DatetimeIndex(movies_df['release_date']).year
movies_df['month'] = movies_df['release_date'].dt.month_name()
x = movies_df.apply(lambda x: pd.Series(x['production_countries']),axis=1).stack().reset_index(level=1, drop=True)
x.name = 'countries' #if we dont do this than we get error while joining as column wont have any name
country_data = movies_df.drop('production_countries', axis=1).join(x)
country_data = pd.DataFrame(country_data['countries'].value_counts())
country_data['country'] = country_data.index
country_data.columns = ['num_movies', 'country']
country_data = country_data.reset_index().drop('index', axis=1)
country_data.head(10)
| num_movies | country | |
|---|---|---|
| 0 | 17851 | [{'iso_3166_1': 'US', 'name': 'United States o... |
| 1 | 6282 | [] |
| 2 | 2238 | [{'iso_3166_1': 'GB', 'name': 'United Kingdom'}] |
| 3 | 1654 | [{'iso_3166_1': 'FR', 'name': 'France'}] |
| 4 | 1356 | [{'iso_3166_1': 'JP', 'name': 'Japan'}] |
| 5 | 1030 | [{'iso_3166_1': 'IT', 'name': 'Italy'}] |
| 6 | 840 | [{'iso_3166_1': 'CA', 'name': 'Canada'}] |
| 7 | 749 | [{'iso_3166_1': 'DE', 'name': 'Germany'}] |
| 8 | 735 | [{'iso_3166_1': 'RU', 'name': 'Russia'}] |
| 9 | 735 | [{'iso_3166_1': 'IN', 'name': 'India'}] |
country_dada = country_data[country_data['country'] != 'UK']
data = dict(type = 'choropleth',
locations = country_data['country'],
colorscale = 'Portland',
locationmode = 'country names',
z = country_data['num_movies'],
text = country_data['country'],
marker = dict(line = dict(color = 'rgb(255,255,255)',width=1)),
colorbar = {'title' : 'No of movies per country'})
layout = dict(title = 'Countries where movies is directed',
geo = dict(showframe= False,
projection = {'type' : 'mercator'}))
choromaps = go.Figure(data=[data], layout=layout)
iplot(choromaps)
fran_data = movies_df[movies_df['belongs_to_collection'].notnull()]
fran_data['belongs_to_collection'] = fran_data['belongs_to_collection'].apply(ast.literal_eval).apply(lambda x: x['name'] if isinstance(x, dict) else np.nan)
<ipython-input-33-402f2c5d7dca>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
fran_data = fran_data[fran_data['belongs_to_collection'].notnull()]
pivotfran_data = pd.pivot_table(fran_data,
index = 'belongs_to_collection',
values='revenue',
aggfunc={'revenue':['count','sum']}).reset_index()
pivotfran_data.sort_values
<bound method DataFrame.sort_values of belongs_to_collection count sum 0 ... Has Fallen Collection 2 366780087.0 1 00 Schneider Filmreihe 0 0.0 2 08/15 Collection 0 0.0 3 100 Girls Collection 0 0.0 4 101 Dalmatians (Animated) Collection 1 215880014.0 ... ... ... ... 1690 Сказки Чуковского 0 0.0 1691 Чебурашка и крокодил Гена 0 0.0 1692 Что Творят мужчины! (Коллекция) 0 0.0 1693 男はつらいよ シリーズ 0 0.0 1694 식객 시리즈 0 0.0 [1695 rows x 3 columns]>
pivotfran_data.sort_values('sum', ascending=False).head(10)
| belongs_to_collection | count | sum | |
|---|---|---|---|
| 552 | Harry Potter Collection | 8 | 7.707367e+09 |
| 1160 | Star Wars Collection | 8 | 7.434495e+09 |
| 646 | James Bond Collection | 26 | 7.106970e+09 |
| 1317 | The Fast and the Furious Collection | 8 | 5.125099e+09 |
| 968 | Pirates of the Caribbean Collection | 5 | 4.521577e+09 |
| 1550 | Transformers Collection | 5 | 4.366101e+09 |
| 325 | Despicable Me Collection | 4 | 3.691070e+09 |
| 1491 | The Twilight Collection | 5 | 3.342107e+09 |
| 610 | Ice Age Collection | 5 | 3.216709e+09 |
| 666 | Jurassic Park Collection | 4 | 3.031484e+09 |
pivotfran_data.sort_values('count', ascending=False).head(10)
| belongs_to_collection | count | sum | |
|---|---|---|---|
| 646 | James Bond Collection | 26 | 7.106970e+09 |
| 473 | Friday the 13th Collection | 12 | 4.648985e+08 |
| 976 | Pokémon Collection | 11 | 6.983008e+08 |
| 552 | Harry Potter Collection | 8 | 7.707367e+09 |
| 540 | Halloween Collection | 8 | 2.471681e+08 |
| 29 | A Nightmare on Elm Street Collection | 8 | 3.635916e+08 |
| 1317 | The Fast and the Furious Collection | 8 | 5.125099e+09 |
| 1432 | The Pink Panther (Original) Collection | 8 | 1.644782e+08 |
| 1160 | Star Wars Collection | 8 | 7.434495e+09 |
| 977 | Police Academy Collection | 7 | 3.046432e+08 |
p = pivotfran_data.sort_values('count', ascending=False)
movies_df.boxplot()
<AxesSubplot:>
fig1 = px.scatter(pivotfran_data.head(25), x="count", y="sum",size="sum", color="belongs_to_collection",
hover_name="count", log_x=True, size_max=60)
fig1.show()
#lets convert to string type
movies_df['title']=movies_df['title'].astype('str')
#movies_df['overview']=movies_df['overview'].astype('str')
#lets join titles continuousy seperated by space
title_data=' '.join(movies_df['title'])
#overview_data=' '.join(movies_df['overview'])
languages = movies_df["original_language"].value_counts()
langues_df = pd.DataFrame({'languages':languages.index, 'frequency':languages.values}).head(10)
fig = px.bar(langues_df, x="frequency", y="languages",color='languages', orientation='h',
hover_data=["languages", "frequency"],
height=1000,
title='Language which has more Movies')
fig.show()
top_movies = movies_df[["title","vote_count"]]
top_movies = top_movies.sort_values(by="vote_count",ascending=False)
fig = px.bar(data_frame=top_movies[:20],x="title",y="vote_count",color="title",title="Most Voted Movies")
fig.show()
s = movies_df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = movies_df.drop('genres', axis=1).join(s)
genre_counts = gen_md.genre.value_counts()
genre_df = pd.DataFrame({'genre':genre_counts.index,"count":genre_counts.values})
fig = px.bar(data_frame=genre_df[:10],x="genre",y="count",color="genre")
fig.show()
movies_df['adult']
0 False
1 False
2 False
3 False
4 False
...
45461 False
45462 False
45463 False
45464 False
45465 False
Name: adult, Length: 45466, dtype: object
adults_count = movies_df['adult'].value_counts()
adults_df = pd.DataFrame({"adults":adults_count.index,"count":adults_count.values})
fig = px.bar(data_frame=adults_df[:2],x="adults",y="count",color="adults")
fig.show()
budget_anlys = movies_df.sort_values(by="budget",ascending=False)
budget_anlys[["budget",'title']].head(10)
| budget | title | |
|---|---|---|
| 17124 | 380000000.0 | Pirates of the Caribbean: On Stranger Tides |
| 11827 | 300000000.0 | Pirates of the Caribbean: At World's End |
| 26558 | 280000000.0 | Avengers: Age of Ultron |
| 11067 | 270000000.0 | Superman Returns |
| 44842 | 260000000.0 | Transformers: The Last Knight |
| 16130 | 260000000.0 | Tangled |
| 18685 | 260000000.0 | John Carter |
| 11780 | 258000000.0 | Spider-Man 3 |
| 21175 | 255000000.0 | The Lone Ranger |
| 22059 | 250000000.0 | The Hobbit: The Desolation of Smaug |